Example: Web Downloads in Three Styles

A Sequential Download Script


In [1]:
import os
import time
import sys

import requests

# Twenty two-letter country codes, written as one space-separated string and
# split into a list. (Presumably the 20 most populous countries, going by the
# name -- confirm.)
POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()

# Root URL of the flag image server; get_flag() appends '/<cc>/<cc>.gif'.
# Alternative host, currently disabled: 'http://flupy.org/data/flags'
# NOTE(review): hard-coded IP address and port -- every download cell depends
# on this server being reachable.
BASE_URL = 'http://104.155.196.181:8001/flags'

# Directory that save_flag() joins with the file name when writing an image.
DEST_DIR = 'downloads/'

def save_flag(img, filename):
    """Write the raw image bytes ``img`` to ``DEST_DIR/filename``.

    The destination directory is created on demand, so the first run on a
    fresh checkout (or fresh kernel) does not fail with FileNotFoundError.
    ``exist_ok=True`` keeps the call safe when several workers save flags
    at the same time.

    Args:
        img: bytes to write (the downloaded image content).
        filename: name of the file to create inside ``DEST_DIR``.
    """
    os.makedirs(DEST_DIR, exist_ok=True)
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)
        
def get_flag(cc, timeout=10):
    """Download the GIF for country code ``cc`` and return its raw bytes.

    Args:
        cc: country code; it is lower-cased to build the URL
            ``BASE_URL/<cc>/<cc>.gif``.
        timeout: seconds passed to ``requests.get`` so that a stalled
            server cannot block the download forever.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Without this check the body of an error page would be returned
            and later written to disk as if it were a ``.gif`` image.
    """
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc=cc.lower())
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.content

def show(text):
    """Print ``text`` followed by a single space and flush stdout at once.

    No newline is emitted, so successive calls build up one line of
    progress output.
    """
    print(text, end=' ', flush=True)
    
def download_many(cc_list):
    """Download every flag in ``cc_list`` one after another.

    Codes are processed in sorted order; each one is fetched, announced
    with show() and written to disk as ``<lower-cased code>.gif``.

    Returns:
        The number of codes in ``cc_list``.
    """
    for code in sorted(cc_list):
        flag_bytes = get_flag(code)
        show(code)
        save_flag(flag_bytes, '{}.gif'.format(code.lower()))
    return len(cc_list)

def main(download_many):
    """Time one run of ``download_many`` over POP20_CC and print a summary.

    Args:
        download_many: callable that takes the list of country codes and
            returns how many flags it downloaded.
    """
    started = time.time()
    n_flags = download_many(POP20_CC)
    duration = time.time() - started
    print('\n{} flags downloaded in {:.2f}s'.format(n_flags, duration))
    
# Run the sequential version and print its timing. The guard is true when the
# cell is executed in the notebook (see the output below the cell) and false
# if this code is imported as a module.
if __name__ == '__main__':
    main(download_many)


BD BR CD CN DE EG ET FR ID IN IR JP MX NG PH PK RU TR US VN 
20 flags downloaded in 9.00s

In [2]:
from concurrent import futures

from flags import save_flag, get_flag, show, main

# Upper bound on the number of worker threads; download_many() uses fewer
# when the list of country codes is shorter than this.
MAX_WORKERS = 20

def download_one(cc):
    """Fetch, announce and store the flag for a single country code.

    This is the unit of work handed to an executor.

    Returns:
        ``cc`` unchanged, so the caller can tell which download finished.
    """
    flag_bytes = get_flag(cc)
    show(cc)
    target_name = '{}.gif'.format(cc.lower())
    save_flag(flag_bytes, target_name)
    return cc

def download_many(cc_list):
    """Download all flags in ``cc_list`` concurrently with a thread pool.

    The pool size is ``min(MAX_WORKERS, len(cc_list))`` so no idle threads
    are created for short lists. Codes are submitted in sorted order via
    ``Executor.map``.

    Args:
        cc_list: sized collection of country codes.

    Returns:
        The number of results produced, i.e. the number of flags downloaded.
        If any download raised, that exception propagates from here when the
        results are consumed.
    """
    if len(cc_list) == 0:
        # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an
        # empty request is answered without creating a pool at all.
        return 0

    workers = min(MAX_WORKERS, len(cc_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(download_one, sorted(cc_list))

    # Leaving the with-block waits for all workers, so every result is ready.
    return len(list(res))

# Time the thread-pool version using the shared main() driver.
if __name__ == '__main__':
    main(download_many)


ID IN FR BR DE JP CN BD NG RU VN TR EG PK CDMX  ET PH US IR 
20 flags downloaded in 1.36s

In [3]:
def download_many(cc_list):
    """Demonstrate ``submit`` / ``as_completed`` on a small sample.

    Only the first five entries of ``cc_list`` are used, and the pool is
    limited to three threads, so the printed future states show some jobs
    running and others still pending. Each future is printed when it is
    scheduled and again, with its result, when it completes.

    Returns:
        The number of results collected.
    """
    sample = sorted(cc_list[:5])
    with futures.ThreadPoolExecutor(max_workers=3) as pool:
        pending = []
        for code in sample:
            job = pool.submit(download_one, code)
            pending.append(job)
            print('Scheduled for {}: {}'.format(code, job))

        finished = []
        # as_completed yields futures in completion order, not submit order.
        for job in futures.as_completed(pending):
            outcome = job.result()
            print('{} result: {!r}'.format(job, outcome))
            finished.append(outcome)

    return len(finished)

# Run the submit/as_completed demo through the shared main() driver.
if __name__ == '__main__':
    main(download_many)


Scheduled for BR: <Future at 0x893e86438 state=running>
Scheduled for CN: <Future at 0x893e860f0 state=running>
Scheduled for ID: <Future at 0x893e5d400 state=running>
Scheduled for IN: <Future at 0x893e5d8d0 state=pending>
Scheduled for US: <Future at 0x893eab978 state=pending>
CNID BR  <Future at 0x893e860f0 state=finished returned str> result: 'CN'
<Future at 0x893e5d400 state=finished returned str> result: 'ID'
<Future at 0x893e86438 state=finished returned str> result: 'BR'
IN <Future at 0x893e5d8d0 state=finished returned str> result: 'IN'
US <Future at 0x893eab978 state=finished returned str> result: 'US'

5 flags downloaded in 1.05s

Launching Processes with concurrent.futures


In [5]:
from concurrent import futures

from flags import save_flag, get_flag, show, main

# Not read by this cell's download_many(), which creates
# ProcessPoolExecutor() without an explicit worker count.
MAX_WORKERS = 20

def download_one(cc):
    """Download one flag, report it with show() and save it to disk.

    The file name is the lower-cased country code plus ``.gif``.

    Returns:
        The country code that was passed in.
    """
    payload = get_flag(cc)
    show(cc)
    file_name = ''.join((cc.lower(), '.gif'))
    save_flag(payload, file_name)
    return cc

def download_many(cc_list):
    """Download all flags in ``cc_list`` using a pool of worker processes.

    Unlike the thread-pool variant, no worker count is computed here:
    ``ProcessPoolExecutor()`` is created with its default size. Codes are
    dispatched in sorted order via ``Executor.map``.

    Returns:
        The number of results produced by the workers.
    """
    ordered_codes = sorted(cc_list)
    with futures.ProcessPoolExecutor() as pool:
        outcomes = pool.map(download_one, ordered_codes)

    # The pool has been shut down here, so iterating only counts results.
    return sum(1 for _ in outcomes)

# Time the process-pool version using the shared main() driver.
if __name__ == '__main__':
    main(download_many)


BD DE CN BR IN TR EGRU ID JP VN  NG FR ET MX PK CD PHIR  US 
20 flags downloaded in 0.56s

Experimenting with Executor.map


In [9]:
from time import sleep, strftime
from concurrent import futures

def display(*args):
    """Print ``args`` on one line, prefixed with a ``[HH:MM:SS]`` timestamp.

    The timestamp is followed by a single space and then the arguments,
    separated by spaces as with print().
    """
    stamp = strftime('[%H:%M:%S]')
    body = ' '.join(str(item) for item in args)
    print('{} {}'.format(stamp, body))
    
def loiter(n):
    """Sleep for ``n`` seconds, logging before and after; return ``n * 10``.

    Both log lines are indented with ``n`` tab characters so that the
    output of calls with different ``n`` is easy to tell apart.
    """
    indent = '\t' * n
    display('{}loiter({}): doing nothing for {}s...'.format(indent, n, n))
    sleep(n)
    display('{}loiter({}): done.'.format(indent, n))
    return n * 10

def main():
    """Show how ``Executor.map`` schedules work and yields results.

    Five ``loiter`` calls (n = 0..4) are mapped onto a pool of three
    threads. ``map`` returns immediately with a generator; iterating over
    it blocks until each result, in submission order, is ready.

    The executor is used as a context manager so its worker threads are
    shut down and joined when the demo finishes instead of being left
    to the garbage collector.

    Note: this definition takes no arguments and rebinds the notebook-level
    name ``main``, which the download cells call as ``main(download_many)``.
    """
    display('Script starting.')
    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        results = executor.map(loiter, range(5))
        # Printed before any iteration: shows that map() did not block.
        display('results:', results)
        display('Waiting for individual results:')
        for i, result in enumerate(results):
            display('result {}: {}'.format(i, result))
        
# Run the Executor.map experiment; the timestamped log appears below the cell.
main()


[21:42:03] Script starting.
[21:42:03] loiter(0): doing nothing for 0s...
[21:42:03] loiter(0): done.
[21:42:03] 	loiter(1): doing nothing for 1s...
[21:42:03] 		loiter(2): doing nothing for 2s...
[21:42:03] results: <generator object Executor.map.<locals>.result_iterator at 0x0000000C084CECA8>
[21:42:03] Waiting for individual results:
[21:42:03] result 0: 0
[21:42:03] 			loiter(3): doing nothing for 3s...
[21:42:04] 	loiter(1): done.
[21:42:04] 				loiter(4): doing nothing for 4s...
[21:42:04] result 1: 10
[21:42:05] 		loiter(2): done.
[21:42:05] result 2: 20
[21:42:06] 			loiter(3): done.
[21:42:06] result 3: 30
[21:42:08] 				loiter(4): done.
[21:42:08] result 4: 40

Downloads with Progress Display and Error Handling


In [11]:
import time
from tqdm import tqdm
# Minimal tqdm demo: wrapping the iterable draws a progress bar as the loop
# advances. 1000 iterations with a 10 ms sleep each take about 10 seconds.
for i in tqdm(range(1000)):
    time.sleep(.01)


100%|██████████████████████████████████████| 1000/1000 [00:10<00:00, 96.02it/s]

In [ ]: